{
  "cells": [
    {
      "metadata": {
        "id": "DhWmZgAVwDGu",
        "colab_type": "text"
      },
      "cell_type": "markdown",
      "source": [
        "Copyright 2021 Google LLC\n",
        "\n",
        "Licensed under the Apache License, Version 2.0 (the \"License\");\n",
        "you may not use this file except in compliance with the License.\n",
        "You may obtain a copy of the License at\n",
        "\n",
        "    https://www.apache.org/licenses/LICENSE-2.0\n",
        "\n",
        "Unless required by applicable law or agreed to in writing, software\n",
        "distributed under the License is distributed on an \"AS IS\" BASIS,\n",
        "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
        "See the License for the specific language governing permissions and\n",
        "limitations under the License."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "EJPjbtcUo-TY"
      },
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "import math\n",
        "import numpy as np"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "9T0xXF5ClvHt"
      },
      "source": [
        "# Load in Data"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ntH4kcrplXpP"
      },
      "outputs": [],
      "source": [
        "# Upload the PD sequencing counts across experiments via the Colab helper.\n",
        "from google.colab import files\n",
        "\n",
        "uploaded = files.upload()\n",
        "\n",
        "# Echo every uploaded file name with the length of its content as a sanity\n",
        "# check (`uploaded` is indexed by file name below).\n",
        "for fn in uploaded:\n",
        "  print('User uploaded file \"{name}\" with length {length} bytes'.format(\n",
        "      length=len(uploaded[fn]), name=fn))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Jkc6xjuzllyj"
      },
      "outputs": [],
      "source": [
        "# Load the PD data into a DataFrame.\n",
        "# NOTE(review): assumes a file with exactly this name is present in the\n",
        "# working directory (presumably the one uploaded in the previous cell) --\n",
        "# confirm.\n",
        "with open('pd_clustered_input_data_manuscript.csv', 'r') as csv_file:\n",
        "  pd_input_df = pd.read_csv(csv_file)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "bSzb9hoelxVK"
      },
      "source": [
        "# Helper Functions"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "0dx1uLWqj28F"
      },
      "outputs": [],
      "source": [
        "def generate_pos_neg_normalized_ratio(df, col_prefix):\n",
        "  \"\"\"Adds per-pool fraction columns and the normalized pos/neg ratio column.\n",
        "\n",
        "  The dataframe passed in is modified in place (columns are added to it) and\n",
        "  the same object is returned.\n",
        "\n",
        "  Args:\n",
        "    df: (pd.DataFrame) Must contain the columns [col_prefix]_positive and\n",
        "      [col_prefix]_negative, holding the read counts for the positive and\n",
        "      negative selection conditions, respectively.\n",
        "    col_prefix: (str) Shared prefix of the two count columns, for example\n",
        "      'round2_high_no_serum'.\n",
        "\n",
        "  Returns:\n",
        "    (pd.DataFrame) The same dataframe with three added columns:\n",
        "    [col_prefix]_positive_frac: each row's share of the summed positive\n",
        "      counts.\n",
        "    [col_prefix]_negative_frac: each row's share of the summed negative\n",
        "      counts.\n",
        "    [col_prefix]_pos_neg_ratio: positive_frac / (positive_frac +\n",
        "      negative_frac), i.e. the read-depth normalized fraction of the\n",
        "      sequence that ended in the positive pool.\n",
        "  \"\"\"\n",
        "  pos_count_col = col_prefix + '_positive'\n",
        "  neg_count_col = col_prefix + '_negative'\n",
        "  pos_frac_col = pos_count_col + '_frac'\n",
        "  neg_frac_col = neg_count_col + '_frac'\n",
        "  ratio_col = col_prefix + '_pos_neg_ratio'\n",
        "\n",
        "  # Divide each pool by its own total so that pools sequenced to different\n",
        "  # depths become comparable.\n",
        "  df[pos_frac_col] = df[pos_count_col] / df[pos_count_col].sum()\n",
        "  df[neg_frac_col] = df[neg_count_col] / df[neg_count_col].sum()\n",
        "\n",
        "  pos_frac = df[pos_frac_col]\n",
        "  neg_frac = df[neg_frac_col]\n",
        "  df[ratio_col] = pos_frac / (pos_frac + neg_frac)\n",
        "  return df"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "NnHhaJdre-Pr"
      },
      "outputs": [],
      "source": [
        "def fraction_to_3bins(frac, min_bin=0.1, max_bin=0.9):\n",
        "  '''Maps a positive / (positive + negative) fraction onto a ternary bin.\n",
        "\n",
        "  Args:\n",
        "    frac: (float) positive / (positive + negative) fraction.\n",
        "    min_bin: (float) Cutoff between bin 0 and bin 1; fractions strictly below\n",
        "      it fall in bin 0.\n",
        "    max_bin: (float) Cutoff between bin 1 and bin 2; fractions strictly above\n",
        "      it fall in bin 2.\n",
        "\n",
        "  Returns:\n",
        "    (int) Bin 0, 1 or 2. A NaN fraction is assigned to bin 0.\n",
        "  '''\n",
        "  if math.isnan(frac) or frac \u003c min_bin:\n",
        "    return 0\n",
        "  return 2 if frac \u003e max_bin else 1\n",
        "\n",
        "\n",
        "def bins_to_super_bins(low, medium, high):\n",
        "  '''Collapses the three per-stringency bins into a single SuperBin label.\n",
        "\n",
        "  Args:\n",
        "    low: (int) Bin for low stringency.\n",
        "    medium: (int) Bin for medium stringency.\n",
        "    high: (int) Bin for high stringency.\n",
        "\n",
        "  Returns:\n",
        "    (int) SuperBin between 0 and 6, or -1 when the bins provide an ambiguous\n",
        "    story and the entry needs to be excluded.\n",
        "  '''\n",
        "  # Keys are (low, medium, high). Any combination that is not listed here is\n",
        "  # considered ambiguous.\n",
        "  super_bin_by_bins = {\n",
        "      (0, 0, 0): 0,  # All three bins are 0.\n",
        "      (1, 0, 0): 1,  # Borderline low stringency.\n",
        "      (2, 0, 0): 2,  # Unambiguous low stringency.\n",
        "      (1, 1, 0): 2,  # Low and medium both borderline: similar to low = 2.\n",
        "      (2, 1, 0): 3,  # Borderline medium stringency.\n",
        "      (2, 2, 0): 4,  # Unambiguous medium stringency.\n",
        "      (2, 1, 1): 4,  # Medium and high both borderline: similar to medium = 2.\n",
        "      (2, 2, 1): 5,  # Borderline high stringency.\n",
        "      (2, 2, 2): 6,  # Unambiguous high stringency.\n",
        "  }\n",
        "  return super_bin_by_bins.get((low, medium, high), -1)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "reZZLXLUl0g5"
      },
      "source": [
        "# Create Binned and SuperBin Labels"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "YdnI5FUohabq"
      },
      "outputs": [],
      "source": [
        "# Add the Binned and SuperBin labels to the dataframe as extra columns.\n",
        "#   Binned cols: low_3bins, med_3bins, high_3bins\n",
        "#   SuperBin col: super_bin\n",
        "# NOTE(review): the 'high' count columns feed the 'low' label and vice versa;\n",
        "# presumably the column prefix names something other than stringency (e.g.\n",
        "# target concentration) -- confirm that this pairing is intentional.\n",
        "for col_prefix, stringency_level in (\n",
        "    ('round2_high_no_serum', 'low'),\n",
        "    ('round2_medium_no_serum', 'med'),\n",
        "    ('round2_low_no_serum', 'high')):\n",
        "  pd_input_df = generate_pos_neg_normalized_ratio(pd_input_df, col_prefix)\n",
        "  pd_input_df['{}_3bins'.format(stringency_level)] = (\n",
        "      pd_input_df[col_prefix + '_pos_neg_ratio'].apply(fraction_to_3bins))\n",
        "\n",
        "# Combine the three per-stringency bins of every row into one SuperBin label.\n",
        "pd_input_df['super_bin'] = pd_input_df.apply(\n",
        "    lambda row: bins_to_super_bins(\n",
        "        row.low_3bins, row.med_3bins, row.high_3bins),\n",
        "    axis=1)"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "collapsed_sections": [],
      "last_runtime": {
        "build_target": "//research/biology/alphagenome:alphagenome_colab",
        "kind": "private"
      },
      "name": "create_binned_and_super_bin_labels.ipynb",
      "provenance": [
        {
          "file_id": "1aY9k1_n079p9c4CPGkgjDe0nqX4nfAFH",
          "timestamp": 1603814159117
        }
      ]
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
